library(reshape2) # melt function
library(ggplot2) # ggplot function
library(pcaPP) # Fast Kendall function
library(energy) # Distance Correlation
library(Hmisc) # Hoeffding's D measure
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, units
library(entropy) # Mutual Information
library(minerva) # Maximum Information Coefficient
library(XICOR) # Chatterjee's Coefficient
library(dHSIC) # Hilbert Schmidt Independence Criterion
library(VineCopula) # Blomqvist's Beta

source("data_generation.R")
my_beta <- function(x, y){
    # following Blomqvist, N. (1950).  On a measure of dependence between two
    # random variables. The Annals of Mathematical Statistics, 21(4), 593-600.
    mx <- median(x); my <- median(y)
    n1 <- sum(x < mx & y > my) +  sum(x > mx & y < my)
    n2 <- sum(x < mx & y < my) +  sum(x > mx & y > my)
    
    return ((n1 - n2)/(n1 + n2))
 
}

my_XICOR <- function(x, y){
  return(max(XICOR::calculateXI(x,y), XICOR::calculateXI(y,x)))
}
generate_quadratic <- function(n){
  t(sapply(1:n, function(i){
    x <- stats::runif(1, min = -1, max = 1)
    y <- x^2
    c(x,y)
  }))
}

generate_quadratic2 <- function(n){
  t(sapply(1:n, function(i){
    x <- stats::runif(1, min = -1, max = 1)
    y <- x^2 + stats::rnorm(1, mean = 0, sd = 0.3)
    c(x,y)
  }))
}

gen_vertical1 <- function(n){
  t(sapply(1:n, function(i){
    x <- stats::runif(1, min = -2, max = 2)
    y <- 0 
    c(x,y)
  }))
}

gen_vertical2 <- function(n){
  t(sapply(1:n, function(i){
    x <- 0
    y <- stats::runif(1, min = -2, max = 2)
    c(x,y)
  }))
}

generate_power <- function(n){
  t(sapply(1:n, function(i){
    x <- stats::runif(1, min = 0, max = 2)
    y <- x^10 + stats::rnorm(1, mean = 10, sd = 10)
    c(x,y)
  }))
}

gen_linear <- function(n){
  t(sapply(1:n, function(i){
    # generate "cluster" assignment
    x <- stats::rnorm(1, mean = 2.5, sd = 1)
    y <- x + stats::rnorm(1, mean = 0, sd = 1)
    c(x,y)
  }))
}

gen_strong_linear <- function(n){
  t(sapply(1:n, function(i){
    # generate "cluster" assignment
    x <- stats::rnorm(1, mean = 2.5, sd = 1)
    y <- x + stats::rnorm(1, mean = 0, sd = 0.3)
    c(x,y)
  }))
}

gen_sine <- function(n){
  t(sapply(1:n, function(i){
    x <- stats::runif(1, min = -2, max = 12)
    y <- sin(x) + stats::rnorm(1, mean = 0, sd = 0.1)
    c(x,y)
  }))
}
quadratic_1 <- generate_quadratic(100)
plot(quadratic_1)

exper_n <- 500

ver1 <- gen_vertical1(exper_n)
plot(ver1)

ver2 <- gen_vertical2(exper_n)
plot(ver2)

cross <- rbind(ver1, ver2)
plot(cross)

power <- generate_power(exper_n)
plot(power)

quadratic2 <- generate_quadratic2(400)
plot(quadratic2)

sine <- gen_sine(exper_n)
plot(sine)

Dcor vs Pearson

plot(quadratic_1)

energy::dcor(quadratic_1[,1],quadratic_1[,2])
## [1] 0.498928
stats::cor(quadratic_1, method = "pearson")
##             [,1]        [,2]
## [1,]  1.00000000 -0.06924509
## [2,] -0.06924509  1.00000000
#stats::cor(quadratic_1, method = "spearman")

Things to consider

lollipop <- .generate_lollipop(500)
plot(lollipop)

stats::cor(lollipop)
##          [,1]     [,2]
## [1,] 1.000000 0.861068
## [2,] 0.861068 1.000000
energy::dcor(lollipop[,1],lollipop[,2])
## [1] 0.8933179

XI vs Spearman

Linear

lin <- gen_linear(200)
plot(lin)

my_XICOR(lin[,1], lin[,2])
## [1] 0.3096077
stats::cor(lin, method = "spearman")
##           [,1]      [,2]
## [1,] 1.0000000 0.6866872
## [2,] 0.6866872 1.0000000

Quadratic

my_XICOR(quadratic_1[,1], quadratic_1[,2])
## [1] 0.9411941
stats::cor(quadratic_1, method = "spearman")
##             [,1]        [,2]
## [1,]  1.00000000 -0.01716172
## [2,] -0.01716172  1.00000000

Sine

my_XICOR(sine[,1], sine[,2])
## [1] 0.8267913
stats::cor(sine, method = "spearman")
##            [,1]       [,2]
## [1,]  1.0000000 -0.0544591
## [2,] -0.0544591  1.0000000

Hoeffding’s D vs XI

quadratic

Hmisc::hoeffd(sine[,1], sine[,2])$D
##            x          y
## x 1.00000000 0.05135241
## y 0.05135241 1.00000000
my_XICOR(sine[,1], sine[,2])
## [1] 0.8267913
plot(quadratic_1)

Hmisc::hoeffd(quadratic_1[,1], quadratic_1[,2])$D
##           x         y
## x 1.0000000 0.2391778
## y 0.2391778 1.0000000
my_XICOR(quadratic_1[,1], quadratic_1[,2])
## [1] 0.9411941

Quadratic with noise

plot(quadratic2)

Hmisc::hoeffd(quadratic2[,1], quadratic2[,2])$D
##            x          y
## x 1.00000000 0.04724254
## y 0.04724254 1.00000000
my_XICOR(quadratic2[,1], quadratic2[,2])
## [1] 0.3513022

Beta vs Spearman

steep exponential

plot(power)

stats::cor(power, method = "spearman")
##           [,1]      [,2]
## [1,] 1.0000000 0.6978712
## [2,] 0.6978712 1.0000000
my_beta(power[,1], power[,2])
## [1] -0.48
gen_quadrant <- function(n){
  t(sapply(1:n, function(i){
    rand_int <- sample(1:4, 1)
    if (rand_int == 1){
      x <- stats::runif(1, min = 0, max = 3)
      y <- stats::runif(1, min = 0, max = 3)
      
    } else if (rand_int == 2){
      x <- stats::runif(1, min = 0, max = 4)
      y <- stats::runif(1, min = 10, max = 14)
      
    } else if (rand_int == 3){
      x <- stats::runif(1, min = 10, max = 14)
      y <- stats::runif(1, min = 0, max = 4)
      
    } else {
      x <- stats::runif(1, min = 10, max = 14)
      y <- stats::runif(1, min = 10, max = 14)
    }
    c(x,y)
  }))
}

quadrant <- gen_quadrant(exper_n)

mean(quadrant[, 1])
## [1] 6.540044
mean(quadrant[, 2])
## [1] 6.86236
plot(quadrant)

Beta vs MI

my_beta(quadrant[, 1], quadrant[, 2])
## [1] -0.104
mi.empirical(entropy::discretize2d(as.matrix(quadrant[, 1]), as.matrix(quadrant[, 2]), numBins1 = 20, numBins2 = 20))
## Warning in KL.plugin(freqs2d, freqs.null, unit = unit): Vanishing value(s) in
## argument freqs2!
## [1] 0.1956087

Beta vs Kendall

plot(quadrant)

my_beta(quadrant[, 1], quadrant[, 2])
## [1] -0.104
stats::cor(quadrant, method = "kendall")
##            [,1]       [,2]
## [1,] 1.00000000 0.07547896
## [2,] 0.07547896 1.00000000
#minerva::mine(quadrant)$MIC
plot(lin)

my_beta(lin[, 1], lin[, 2])
## [1] -0.46
stats::cor(lin, method = "kendall")
##           [,1]      [,2]
## [1,] 1.0000000 0.4977889
## [2,] 0.4977889 1.0000000
#minerva::mine(quadrant)$MIC

MIC vs MI

plot(quadratic_1)

minerva::mine(quadratic_1)$MIC
##      [,1] [,2]
## [1,]    1    1
## [2,]    1    1
mi.empirical(entropy::discretize2d(as.matrix(quadratic_1[, 1]), as.matrix(quadratic_1[, 2]), numBins1 = 10, numBins2 = 10))
## [1] 1.244338

2. Explore more about the lollipop

2-1. Original lollipop

original <- .generate_lollipop
set.seed(11)
lollipop <- original(500)
plot(lollipop)

stats::cor(lollipop, method = "pearson")[1,2]
## [1] 0.8536084
energy::dcor(lollipop[,1],lollipop[,2])
## [1] 0.8884833
my_XICOR(lollipop[,1], lollipop[,2])
## [1] 0.6018384
stats::cor(lollipop, method = "spearman")[1,2]
## [1] 0.8496716
Hmisc::hoeffd(lollipop[,1], lollipop[,2])$D
##           x         y
## x 1.0000000 0.4040181
## y 0.4040181 1.0000000
my_XICOR(lollipop[,1], lollipop[,2])
## [1] 0.6018384
stats::cor(lollipop, method = "spearman")[1,2]
## [1] 0.8496716
my_beta(lollipop[, 1], lollipop[, 2])
## [1] -0.888
my_beta(lollipop[, 1], lollipop[, 2])
## [1] -0.888
stats::cor(lollipop, method = "kendall")[1,2]
## [1] 0.6607615

2-2. Three clusters in total

new_lollipop1 <- function(n){
  t(sapply(1:n, function(i){
    # generate "cluster" assignment
    k <- sample(1:3, 1)
    
    # generate "ball"
    if(k == 1){
      x <- stats::rnorm(1, mean = 0, sd = 1)
      y <- stats::rnorm(1, mean = 0, sd = 1)
    
      # generate "bridge"
    } else if (k == 2){
      x <- stats::rnorm(1, mean = 1, sd = 0.5)
      y <- stats::rnorm(1, mean = 2, sd = 0.5)
      
    } else {
      # k == 3, generate "stick"
      x <- stats::rnorm(1, mean = 3, sd = 0.8)
      y <- x + stats::rnorm(1, mean = 1, sd = 0.5)
    } 
    
    c(x,y)
  }))
}
set.seed(10)
new_pop1 <- new_lollipop1(500)
plot(new_pop1)

stats::cor(new_pop1, method = "pearson")[1,2]
## [1] 0.8219725
energy::dcor(new_pop1[,1],new_pop1[,2])
## [1] 0.8336293
my_XICOR(new_pop1[,1], new_pop1[,2])
## [1] 0.5233341
stats::cor(new_pop1, method = "spearman")[1,2]
## [1] 0.8256502
Hmisc::hoeffd(new_pop1[,1], new_pop1[,2])$D
##           x         y
## x 1.0000000 0.3500869
## y 0.3500869 1.0000000
my_XICOR(new_pop1[,1], new_pop1[,2])
## [1] 0.5233341
stats::cor(new_pop1, method = "spearman")[1,2]
## [1] 0.8256502
my_beta(new_pop1[, 1], new_pop1[, 2])
## [1] -0.584
my_beta(new_pop1[, 1], new_pop1[, 2])
## [1] -0.584
stats::cor(new_pop1, method = "kendall")[1,2]
## [1] 0.6452745

2-3. Two clusters that continously connect the circle and stick

set.seed(10)
new_lollipop2 <- function(n){
  t(sapply(1:n, function(i){
    # generate "cluster" assignment
    k <- sample(1:2, 1)
    
    # generate "ball"
    if(k == 1){
      x <- stats::rnorm(1, mean = 0, sd = 1)
      y <- stats::rnorm(1, mean = 0, sd = 1)
    } else {
      # k == 2, generate "stick"
      x <- stats::rnorm(1, mean = 1.5, sd = 0.8)
      y <- x + stats::rnorm(1, mean = 0.5, sd = 0.5)
    }
        
    c(x,y)
  }))
}
new_pop2 <- new_lollipop2(500)
plot(new_pop2)

stats::cor(new_pop2, method = "pearson")[1,2]
## [1] 0.6320267
energy::dcor(new_pop2[,1],new_pop2[,2])
## [1] 0.6309555
my_beta(new_pop2[,1], new_pop2[,2])
## [1] -0.528
stats::cor(new_pop2, method = "spearman")[1,2]
## [1] 0.6461776
Hmisc::hoeffd(new_pop2[,1], new_pop2[,2])$D
##           x         y
## x 1.0000000 0.1889667
## y 0.1889667 1.0000000
my_beta(new_pop2[,1], new_pop2[,2])
## [1] -0.528
stats::cor(new_pop2, method = "spearman")[1,2]
## [1] 0.6461776
my_beta(new_pop2[,1], new_pop2[,2])
## [1] -0.528
my_beta(new_pop2[,1], new_pop2[,2])
## [1] -0.528
stats::cor(new_pop2, method = "kendall")[1,2]
## [1] 0.4778998

2-4.The lollipop that has a very long stick

set.seed(15)
new_lollipop3 <- function(n){
  t(sapply(1:n, function(i){
    # generate "cluster" assignment
    k <- sample(1:2, 1)
    
    # generate "ball"
    if(k == 1){
      x <- stats::rnorm(1, mean = 0, sd = 1)
      y <- stats::rnorm(1, mean = 0, sd = 1)
      
    } else {
      # k == 2, generate "stick"
      x <- stats::rnorm(1, mean = 6, sd = 4)
      y <- x + stats::rnorm(1, mean = 1, sd = 0.5)
    }
        
    c(x,y)
  }))
}
new_pop3 <- new_lollipop3(500)
plot(new_pop3)

stats::cor(new_pop3, method = "pearson")[1,2]
## [1] 0.96866
energy::dcor(new_pop3[,1],new_pop3[,2])
## [1] 0.9701998
my_beta(new_pop3[,1], new_pop3[,2])
## [1] -0.736
stats::cor(new_pop3, method = "spearman")[1,2]
## [1] 0.8447599
Hmisc::hoeffd(new_pop3[,1], new_pop3[,2])$D
##         x       y
## x 1.00000 0.45858
## y 0.45858 1.00000
my_beta(new_pop3[,1], new_pop3[,2])
## [1] -0.736
stats::cor(new_pop3, method = "spearman")[1,2]
## [1] 0.8447599
my_beta(new_pop3[,1], new_pop3[,2])
## [1] -0.736
my_beta(new_pop3[,1], new_pop3[,2])
## [1] -0.736
stats::cor(new_pop3, method = "kendall")[1,2]
## [1] 0.6949579

2-5. The lollipop that has less longer stick

set.seed(230)
new_lollipop4 <- function(n){
  t(sapply(1:n, function(i){
    # generate "cluster" assignment
    k <- sample(c(1,2), 1)
    
    # generate "ball"
    if(k == 1){
      x <- stats::rnorm(1, mean = 0, sd = 0.5)
      y <- stats::rnorm(1, mean = 1, sd = 0.5)
    } else {
      # k == 2, generate "stick"
      x <- stats::rnorm(1, mean = 1, sd = 1)
      y <- x + stats::rnorm(1, mean = 1, sd = 0.3)
    }
    
    c(x,y)
  }))
}
new_pop4 <- new_lollipop4(500)
plot(new_pop4)

stats::cor(new_pop4, method = "pearson")[1,2]
## [1] 0.8282317
energy::dcor(new_pop4[,1],new_pop4[,2])
## [1] 0.808789
my_XICOR(new_pop4[,1], new_pop4[,2])
## [1] 0.4473378
stats::cor(new_pop4, method = "spearman")[1,2]
## [1] 0.729348
Hmisc::hoeffd(new_pop4[,1], new_pop4[,2])$D
##           x         y
## x 1.0000000 0.2651078
## y 0.2651078 1.0000000
my_XICOR(new_pop4[,1], new_pop4[,2])
## [1] 0.4473378
stats::cor(new_pop4, method = "spearman")[1,2]
## [1] 0.729348
my_beta(new_pop4[ ,1], new_pop4[, 2])
## [1] -0.552
my_beta(new_pop4[ ,1], new_pop4[, 2])
## [1] -0.552
stats::cor(new_pop4, method = "kendall")[1,2]
## [1] 0.5559118

2-6. The lollipop that has less dense stick in the circle

set.seed(230)
new_lollipop5 <- function(n){
  t(sapply(1:n, function(i){
    # generate "cluster" assignment
    k <- sample(c(1,2), 1)
    
    # generate "ball"
    if(k == 1){
      x <- stats::rnorm(1, mean = 0, sd = 1)
      y <- stats::rnorm(1, mean = 0, sd = 1)
    } else {
      # k == 2, generate "stick"
      x <- stats::rnorm(1, mean = 0, sd = 1)
      y <- x + stats::rnorm(1, mean = 0, sd = 0.3)
    }
    
    c(x,y)
  }))
}
new_pop5 <- new_lollipop5(500)
plot(new_pop5)

stats::cor(new_pop5, method = "pearson")[1,2]
## [1] 0.4485494
energy::dcor(new_pop5[,1],new_pop5[,2])
## [1] 0.4538884
my_XICOR(new_pop5[,1], new_pop5[,2])
## [1] 0.1725967
stats::cor(new_pop5, method = "spearman")[1,2]
## [1] 0.4548433
Hmisc::hoeffd(new_pop5[,1], new_pop5[,2])$D
##            x          y
## x 1.00000000 0.09886414
## y 0.09886414 1.00000000
my_XICOR(new_pop5[,1], new_pop5[,2])
## [1] 0.1725967
stats::cor(new_pop5, method = "spearman")[1,2]
## [1] 0.4548433
my_beta(new_pop5[ ,1], new_pop5[, 2])
## [1] -0.432
my_beta(new_pop5[ ,1], new_pop5[, 2])
## [1] -0.432
stats::cor(new_pop5, method = "kendall")[1,2]
## [1] 0.3461162

2-7. The lollipop that has very dense stick in the circle

set.seed(230)
new_lollipop6 <- function(n){
  t(sapply(1:n, function(i){
    # generate "cluster" assignment
    k <- sample(c(1,2), 1)
    
    # generate "ball"
    if(k == 1){
      x <- stats::rnorm(1, mean = 0, sd = 2)
      y <- stats::rnorm(1, mean = 0, sd = 2)
    } else {
      # k == 2, generate "stick"
      x <- stats::rnorm(1, mean = 0, sd = 0.5)
      y <- x + stats::rnorm(1, mean = 0, sd = 0.3)
    }
    
    c(x,y)
  }))
}
new_pop6 <- new_lollipop6(600)
plot(new_pop6)

stats::cor(new_pop6, method = "pearson")[1,2]
## [1] 0.03037599
energy::dcor(new_pop6[,1],new_pop6[,2])
## [1] 0.265397
my_XICOR(new_pop6[,1], new_pop6[,2])
## [1] 0.159381
stats::cor(new_pop6, method = "spearman")[1,2]
## [1] 0.2145965
Hmisc::hoeffd(new_pop6[,1], new_pop6[,2])$D
##            x          y
## x 1.00000000 0.06169904
## y 0.06169904 1.00000000
my_XICOR(new_pop6[,1], new_pop6[,2])
## [1] 0.159381
stats::cor(new_pop6, method = "spearman")[1,2]
## [1] 0.2145965
my_beta(new_pop6[ ,1], new_pop6[, 2])
## [1] -0.3333333
my_beta(new_pop6[ ,1], new_pop6[, 2])
## [1] -0.3333333
stats::cor(new_pop6, method = "kendall")[1,2]
## [1] 0.2000779

2-8. The lollipop that has multiple clusters in the ball

set.seed(230)
new_lollipop7 <- function(n){
  t(sapply(1:n, function(i){
    # generate "cluster" assignment
    k <- sample(1:5, 1)
    
    # generate "ball"
    if(k == 1){
      x <- stats::rnorm(1, mean = 0.5, sd = 0.5)
      y <- stats::rnorm(1, mean = 0.5, sd = 0.5)
    } else if (k == 2) {
      x <- stats::rnorm(1, mean = -0.5, sd = 0.5)
      y <- stats::rnorm(1, mean = -0.5, sd = 0.5)
    } else if (k == 3) {
      x <- stats::rnorm(1, mean = 0.5, sd = 0.5)
      y <- stats::rnorm(1, mean = -0.5, sd = 0.5)
    } else if (k == 4) {
      x <- stats::rnorm(1, mean = -0.5, sd = 0.5)
      y <- stats::rnorm(1, mean = 0.5, sd = 0.5)
    } else {
      # Otherwise, generate "stick"
      x <- stats::rnorm(1, mean = 1.5, sd = 0.8)
      y <- x + stats::rnorm(1, mean = 0.5, sd = 0.5)
    }
    
    c(x,y)
  }))
}
new_pop7 <- new_lollipop7(700)
plot(new_pop7)

stats::cor(new_pop7, method = "pearson")[1,2]
## [1] 0.539343
energy::dcor(new_pop7[,1],new_pop7[,2])
## [1] 0.5047635
my_XICOR(new_pop7[,1], new_pop7[,2])
## [1] 0.2105576
stats::cor(new_pop7, method = "spearman")[1,2]
## [1] 0.3634762
Hmisc::hoeffd(new_pop7[,1], new_pop7[,2])$D
##            x          y
## x 1.00000000 0.05230268
## y 0.05230268 1.00000000
my_XICOR(new_pop7[,1], new_pop7[,2])
## [1] 0.2105576
stats::cor(new_pop7, method = "spearman")[1,2]
## [1] 0.3634762
my_beta(new_pop7[ ,1], new_pop7[, 2])
## [1] -0.2057143
my_beta(new_pop7[ ,1], new_pop7[, 2])
## [1] -0.2057143
stats::cor(new_pop7, method = "kendall")[1,2]
## [1] 0.2537911

2-9. The lollipop that has multiple clusters in the ball and more weights on stick than previous one

set.seed(230)
new_lollipop8 <- function(n){
  t(sapply(1:n, function(i){
    # generate "cluster" assignment
    k <- sample(1:6, 1)
    
    # generate "ball"
    if(k == 1){
      x <- stats::rnorm(1, mean = 0.5, sd = 0.5)
      y <- stats::rnorm(1, mean = 0.5, sd = 0.5)
    } else if (k == 2) {
      x <- stats::rnorm(1, mean = -0.5, sd = 0.5)
      y <- stats::rnorm(1, mean = -0.5, sd = 0.5)
    } else if (k == 3) {
      x <- stats::rnorm(1, mean = 0.5, sd = 0.5)
      y <- stats::rnorm(1, mean = -0.5, sd = 0.5)
    } else if (k == 4) {
      x <- stats::rnorm(1, mean = -0.5, sd = 0.5)
      y <- stats::rnorm(1, mean = 0.5, sd = 0.5)
    } else {
      # Otherwise, generate "stick"
      x <- stats::rnorm(1, mean = 1, sd = 1)
      y <- x + stats::rnorm(1, mean = 0.5, sd = 0.3)
    }
    
    c(x,y)
  }))
}
new_pop8 <- new_lollipop8(800)
plot(new_pop8)

stats::cor(new_pop8, method = "pearson")[1,2]
## [1] 0.6504005
energy::dcor(new_pop8[,1],new_pop8[,2])
## [1] 0.6132621
my_XICOR(new_pop8[,1], new_pop8[,2])
## [1] 0.3001927
stats::cor(new_pop8, method = "spearman")[1,2]
## [1] 0.5171377
Hmisc::hoeffd(new_pop8[,1], new_pop8[,2])$D
##           x         y
## x 1.0000000 0.1235858
## y 0.1235858 1.0000000
my_XICOR(new_pop8[,1], new_pop8[,2])
## [1] 0.3001927
stats::cor(new_pop8, method = "spearman")[1,2]
## [1] 0.5171377
my_beta(new_pop8[ ,1], new_pop8[, 2])
## [1] -0.32
my_beta(new_pop8[ ,1], new_pop8[, 2])
## [1] -0.32
stats::cor(new_pop8, method = "kendall")[1,2]
## [1] 0.3823467